#--------------------------ANALIZA DANYCH ITP-------------------------------------------
#wczytanie danych
data<-read.table("mammographic_masses.data", header=FALSE, sep=",");
colnames(data) <- c('bi_rads', 'age', 'shape', 'margin', 'density', 'severity')


nan_data <- which(data=='?');
new_data <- replace(data, data=='?', NA); #zamiana ? na NA
new_data <- as.data.frame(lapply(new_data,as.numeric)) # konwersja wszystkich wartosci w dataframie na numeric

# Bindowanie kolorow do cech
featureColors <- c(bi_rads="#9ACD32",
                  age="#F08080",
                  shape="#B0E0E6",
                  margin="#DEB887",
                  density="#BA55D3",
                  severity="#66CDAA")

Analiza zbioru danych Mammographic Mass

Histogramy dla poszczególnych cech w zbiorze danych:

#-------------------------histogramy pierwsze---------------------

bi_rads_h <- hist(new_data$bi_rads, main="Ocena BI-RADS", xlab="Wartosc", ylab="Ilosc wystapien", xlim=c(0,max(new_data$bi_rads, na.rm = TRUE)), ylim=c(0, 600), breaks=56, col=featureColors['bi_rads'])

zero = which(bi_rads_h$counts == 0)
text(bi_rads_h$mids[-zero],bi_rads_h$counts[-zero],labels=bi_rads_h$counts[-zero], adj=c(0.5, -0.5))

age_h <- hist(new_data$age, main="Wiek pacjentki (w latach)", xlab="Wiek", ylab="Ilosc wystapien", xlim=c(0,100), breaks=99, col=featureColors['age'])

shape_h <- hist(new_data$shape, main="Ksztalt guza", xlab="Ksztalt", ylab="Ilosc wystapien", xlim=c(0,4), ylim=c(0, 450), breaks=0:4, col=featureColors['shape'], xaxt="n")
text(shape_h$mids,shape_h$counts,labels=shape_h$counts, adj=c(0.5, -0.5))
axis(1, shape_h$mids, c("round", "oval", "lobular", "irregular"), line=-1, lwd=0, lwd.ticks = 1)

margin_h <- hist(new_data$margin, main="Margines guza", xlab="Margines", ylab="Ilosc wystapien", xlim=c(0,max(new_data$margin, na.rm = TRUE)), ylim=c(0, 400), breaks=0:5, col=featureColors['margin'], xaxt="n")
text(margin_h$mids,margin_h$counts,labels=margin_h$counts, adj=c(0.5, -0.5))
axis(1, at=margin_h$mids, labels = rep_len("", length(margin_h$mids)), lwd=0, lwd.ticks = 1, line = -1)
text(margin_h$mids, par("usr")[3] - 0.2, labels = c("circumscribed", "microlobulated", "obscured", "ill-defined", "spiculated"),
     srt = 35, pos = 1, xpd = TRUE)

density_h <- hist(new_data$density, main="Gestosc guza", xlab="Gestosc guza", ylab="Ilosc wystapien", xlim=c(0,max(new_data$density, na.rm = TRUE)), ylim=c(0, 850), breaks=0:4, col=featureColors['density'], xaxt="n")
text(density_h$mids,density_h$counts,labels=density_h$counts, adj=c(0.5, -0.5))
axis(1, density_h$mids, c("high", "iso", "low", "fat-containing"), line=-1, lwd=0, lwd.ticks = 1)

severity_h <- hist(new_data$severity, right=F, main="Stopien zaawansowania guza", xlab="Zlosliwosc", ylab="Ilosc wystapien", xlim=c(0,1), ylim = c(0,600), breaks=2, col=featureColors['severity'], xaxt="n")
text(severity_h$mids,severity_h$counts,labels=severity_h$counts, adj=c(0.5, -0.5))
axis(1, severity_h$mids, c("benign", "malignant"), line=-1, lwd=0, lwd.ticks = 1)

Statystyki zbioru danych Mammographic Mass (z pominieciem wartości NA)

Definicja funkcji + obliczenie statystyk zbioru:

#--------funckje wazne----------------------------
getmode <- function(v, na.rm=TRUE) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

static_values <- function(v) {
  min <- min(v, na.rm = TRUE)
  max <- max(v, na.rm = TRUE)
  mean <- mean(v, na.rm = TRUE)
  sd <- sd(v, na.rm = TRUE)
  var <- var(v, na.rm = TRUE)
  median <- median(v, na.rm = TRUE)
  mode <- getmode(v, na.rm = TRUE)
  res <- c(min, max, mean, median, mode, sd, var)
  names(res) <-c('Min', 'Max', 'Mean', 'Median', 'Mode', 'Sd', 'Var')
  return(res)
}
#--------------------------------------------------
#wartosci statystyczne zbioru danych
(data_stats <- sapply(new_data, static_values))
##          bi_rads       age    shape   margin   density  severity
## Min     0.000000  18.00000 1.000000 1.000000 1.0000000 0.0000000
## Max    55.000000  96.00000 4.000000 5.000000 4.0000000 1.0000000
## Mean    4.348279  55.48745 2.721505 2.796276 2.9107345 0.4630593
## Median  4.000000  57.00000 3.000000 3.000000 3.0000000 0.0000000
## Mode    4.000000  59.00000 4.000000 1.000000 3.0000000 0.0000000
## Sd      1.783031  14.48013 1.242792 1.566546 0.3804439 0.4988932
## Var     3.179201 209.67419 1.544532 2.454065 0.1447376 0.2488944

Czyszczenie zbioru danych z wartosci NA oraz wartosci odstajacych

Wartosc bi_rads=55 powstala prawdopodobnie w wyniku bledu podczas tworzenia zbioru danych Mammographic Mass. Zakladamy, ze chodzilo o wartosc 5 i na taka podmieniamy ewidentnie bledna wartosc:

#------------------------USUNIECIE NIEKOMPLETNYCH DANYCH--------------------------------

data_clean <- new_data[complete.cases(new_data), ]


#dealing with element odstajacy
data_clean$bi_rads[data_clean$bi_rads==55] <- 5

Analiza zbioru danych Mammographic Mass po wyczyszczeniu danych

#-------------------------histogramy po czyszczeniu---------------------

bi_rads_new_h <- hist(data_clean$bi_rads, main="Ocena BI-RADS", xlab="Wartosc", ylab="Ilosc wystapien", xlim=c(0,6), ylim=c(0, 500), breaks=-0.5:6.5, col=featureColors['bi_rads'], xaxt="n")
text(bi_rads_new_h$mids,bi_rads_new_h$counts,labels=bi_rads_new_h$counts, adj=c(0.5, -0.5))
axis(1, bi_rads_new_h$mids, seq_along(bi_rads_new_h$mids))

age_new_h <- hist(data_clean$age, main="Wiek pacjentki (w latach)", xlab="Wiek", ylab="Ilosc wystapien", xlim=c(0,100), breaks=99, col=featureColors['age'])

shape_new_h <- hist(data_clean$shape, main="Ksztalt guza", xlab="Ksztalt", ylab="Ilosc wystapien", xlim=c(0,4), ylim=c(0,400), breaks=0:4, col=featureColors['shape'], xaxt="n")
text(shape_new_h$mids,shape_new_h$counts,labels=shape_new_h$counts, adj=c(0.5, -0.5))
axis(1, shape_new_h$mids, c("round", "oval", "lobular", "irregular"), line=-1, lwd=0, lwd.ticks = 1)

margin_new_h <- hist(data_clean$margin, main="Margines guza", xlab="Margines", ylab="Ilosc wystapien", xlim=c(0,max(data_clean$margin, na.rm = TRUE)), ylim=c(0,400), breaks=0:5, col=featureColors['margin'], xaxt="n")
text(margin_new_h$mids,margin_new_h$counts,labels=margin_new_h$counts, adj=c(0.5, -0.5))
axis(1, at=margin_new_h$mids, labels = rep_len("", length(margin_new_h$mids)), lwd=0, lwd.ticks = 1, line = -1)
text(margin_new_h$mids, par("usr")[3] - 0.2, labels = c("circumscribed", "microlobulated", "obscured", "ill-defined", "spiculated"),
     srt = 25, pos = 1, xpd = TRUE)

density_new_h <- hist(data_clean$density, main="Gestosc guza", xlab="Gestosc guza", ylab="Ilosc wystapien", xlim=c(0,max(data_clean$density, na.rm = TRUE)), ylim=c(0,800), breaks=0:4, col=featureColors['density'], xaxt="n")
text(density_new_h$mids,density_new_h$counts,labels=density_new_h$counts, adj=c(0.5, -0.5))
axis(1, density_h$mids, c("high", "iso", "low", "fat-containing"), line=-1, lwd=0, lwd.ticks = 1)

severity_new_h <- hist(data_clean$severity, right=F, main="Stopien zaawansowania guza", xlab="Zlosliwosc", ylab="Ilosc wystapien", xlim=c(0,1), ylim = c(0,500), breaks=2, col=featureColors['severity'], xaxt="n")
text(severity_new_h$mids,severity_new_h$counts,labels=severity_new_h$counts, adj=c(0.5, -0.5))
axis(1, severity_new_h$mids, c("benign", "malignant"), line=-1, lwd=0, lwd.ticks = 1)

Analiza zbioru pod katem determinant zlosliwosci guza

bi_rads_benVSmag <- as.matrix(t(prop.table(table(data_clean[c('bi_rads', 'severity')]), 1)))
bi_rads_benVSmag <- cbind(bi_rads_benVSmag[,1], c(0.0,0.0), bi_rads_benVSmag[,-1])
colnames(bi_rads_benVSmag) <- as.character(0:6)

age_benVSmag <- t(prop.table(table(data_clean[c('age', 'severity')]), 1))
shape_benVSmag <- t(prop.table(table(data_clean[c('shape', 'severity')]), 1))
margin_benVSmag <- t(prop.table(table(data_clean[c('margin', 'severity')]), 1))
density_benVSmag <- t(prop.table(table(data_clean[c('density', 'severity')]), 1))


# barploty
par(mar = c(5.1, 4.1, 4.1, 6))
b <- barplot(bi_rads_benVSmag, col = c("seagreen3", "firebrick3"),
             main = "Ocena BI-RADS vs. udzial guzow lagodnych/zlosliwych",
             xlab = "Ocena BI-RADS", ylab = "%",
             legend.text = c("benign", "malignant"),
             args.legend = list(x = "topright", inset = c(-0.2, 0)))
text(b[2], 0.48, "BRAK", col="tomato2", srt=90, font=2)

barplot(age_benVSmag, col = c("seagreen3", "firebrick3"),
        main = "Wiek pacjentki vs. udzial guzów lagodnych/zlosliwych",
        xlab = "Wiek pacjentki", ylab = "%",
        legend.text = c("benign", "malignant"),
        args.legend = list(x = "topright", inset = c(-0.2, 0)))

b <- barplot(shape_benVSmag, col = c("seagreen3", "firebrick3"),
             main = "Ksztalt masy vs. udzial guzow lagodnych/zlosliwych",
             xlab = "Ksztalt masy", ylab = "%", xaxt="n",
             legend.text = c("benign", "malignant"),
             args.legend = list(x = "topright", inset = c(-0.2, 0)))

axis(1, b, c("round", "oval", "lobular", "irregular"), line=-1, lwd=0, lwd.ticks = 1)

b <- barplot(margin_benVSmag, col = c("seagreen3", "firebrick3"),
             main = "Margines masy vs. udzial guzow lagodnych/zlosliwych",
             xlab = "Margines masy", ylab = "%", xaxt="n",
             legend.text = c("benign", "malignant"),
             args.legend = list(x = "topright", inset = c(-0.2, 0)))

axis(1, b, rep("", length(b)), line=-1, lwd=0, lwd.ticks = 1)
text(b, par("usr")[3], labels = c("circumscribed", "microlobulated", "obscured", "ill-defined", "spiculated"),
     srt = 25, pos = 1, xpd = TRUE)

b <- barplot(density_benVSmag, col = c("seagreen3", "firebrick3"),
             main = "Gestosc masy vs. udzial guzow lagodnych/zlosliwych",
             xlab = "Gestosc masy", ylab = "%", xaxt="n",
             legend.text = c("benign", "malignant"),
             args.legend = list(x = "topright", inset = c(-0.2, 0)))

axis(1, b, c("high", "iso", "low", "fat-containing"), line=-1, lwd=0, lwd.ticks = 1)

Macierz korelacji

require(reshape2)
require(plotly)
cormat <- round(cor(data_clean), 2);
melted_cormat <- melt(cormat);

# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
  cormat[upper.tri(cormat)] <- NA
  return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
  cormat[lower.tri(cormat)]<- NA
  return(cormat)
}

upper_tri <- get_upper_tri(cormat);

reorder_cormat <- function(cormat){
  # Use correlation between variables as distance
  dd <- as.dist((1-cormat)/2)
  hc <- hclust(dd)
  cormat <-cormat[hc$order, hc$order]
}

cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)

ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
      geom_tile(color = "white")+
      scale_fill_gradient2(low = "black", high = "firebrick2", mid = "dodgerblue4", 
                           midpoint = 0, limit = c(-1,1), space = "Lab", 
                           name="Pearson\nCorrelation") +
      theme_minimal()+ # minimal theme
      theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                       size = 12, hjust = 1))+
      theme(axis.text.y = element_text(size = 12)) + 
      coord_fixed()
    
    ggplotly(ggheatmap + 
      geom_text(aes(Var2, Var1, label = value), color = "white", size = 4) +
      theme(
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        legend.justification = c(1, 0),
        legend.position = c(0.6, 0.7),
        legend.direction = "horizontal")+
      guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                                   title.position = "top", title.hjust = 0.5)))

Standaryzacja zbioru danych

Do standardowego rozkładu normalnego N(0, 1), bez labeli (czyli bez cechy severity):

s <- scale(data_clean[,2:5])

scaled_centers <- attr(s, 'scaled:center')
scaled_scales <- attr(s, 'scaled:scale')

data_stand <- data.frame(age=s[,1], shape=s[,2], margin=s[,3], density=s[,4], data_clean['severity'])
head(data_stand, 10)
##            age      shape     margin    density severity
## 1   0.76460188  0.1755305  1.3953435  0.2403212        1
## 3   0.15117947  0.9804496  1.3953435  0.2403212        1
## 4  -1.89356190 -1.4343076 -1.1570204  0.2403212        0
## 9   0.08302143 -1.4343076  1.3953435  0.2403212        1
## 11  1.37802429 -1.4343076  0.7572525  0.2403212        1
## 12 -0.93934926 -0.6293885 -1.1570204  0.2403212        1
## 14 -1.34829753  0.1755305 -1.1570204 -2.6092013        0
## 15  0.28749556 -0.6293885 -1.1570204 -2.6092013        0
## 16 -0.12145271 -1.4343076 -1.1570204  0.2403212        0
## 17 -0.25776880  0.1755305  0.7572525  0.2403212        0

Utworzenie oraz trening sieci neuronowej

Na zbiorze treningowym.

library(rpart)
library(caret)
require(neuralnet)

#podział na zbior treningnowy i testowy
#sev <- unlist(data_clean['severity'])


sev <- data_clean$severity
set.seed(100)
part <- createDataPartition(sev, p = 0.8, list  = FALSE)

training_set <- scale(data_clean[part, 2:5])
scaled_centers <- attr(training_set, 'scaled:center')
scaled_scales <- attr(training_set, 'scaled:scale')

testing_set <- data_clean[-part, 2:5]

training_sev <-sev[part]
testing_sev <-sev[-part]



data_stand_tr <- data.frame(age=training_set[,1], shape=training_set[,2], margin=training_set[,3], density=training_set[,4], training_sev)
data_ts <- data.frame(age=testing_set[,1], shape=testing_set[,2], margin=testing_set[,3], density=testing_set[,4], testing_sev)

nn <- neuralnet(training_sev ~ age+shape+margin+density, data=data_stand_tr,
                err.fct = "sse", hidden = 2, act.fct = "logistic")

predsVStarget_tr <- data.frame(case=rownames(data_stand_tr),
                               predictions=factor(round(unlist(nn$net.result), digits = 0), labels = c('benign', 'malignant')),
                               target=factor(data_stand_tr$training_sev, labels = c('benign', 'malignant')))

pred <-round(predict(nn, scale(testing_set, center = scaled_centers, scale = scaled_scales)), 2)

predsVStarget_ts <-data.frame(case = rownames(testing_set), 
                              predictions=factor(round(pred, digits = 0), labels = c('benign', 'malignant')),
                              target = factor(data_ts$testing_sev, labels = c('benign', 'malignant')))

confMatrix_tr <- confusionMatrix(data=predsVStarget_tr$predictions, reference=predsVStarget_tr$target, positive = "malignant")
confMatrix_ts <- confusionMatrix(data=predsVStarget_ts$predictions, reference=predsVStarget_ts$target, positive = "malignant")

Macierze pomyłek

confMatrix_tr
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  benign malignant
##   benign       271        49
##   malignant     74       270
##                                           
##                Accuracy : 0.8148          
##                  95% CI : (0.7831, 0.8436)
##     No Information Rate : 0.5196          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.63            
##                                           
##  Mcnemar's Test P-Value : 0.03046         
##                                           
##             Sensitivity : 0.8464          
##             Specificity : 0.7855          
##          Pos Pred Value : 0.7849          
##          Neg Pred Value : 0.8469          
##              Prevalence : 0.4804          
##          Detection Rate : 0.4066          
##    Detection Prevalence : 0.5181          
##       Balanced Accuracy : 0.8160          
##                                           
##        'Positive' Class : malignant       
## 
fourfoldplot(confMatrix_tr$table, color = c("firebrick2", "seagreen3"),
             conf.level = 0, margin = 1, main = "Macierz pomylek na zbiorze trenigowym")

confMatrix_ts
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  benign malignant
##   benign        60         7
##   malignant     22        77
##                                           
##                Accuracy : 0.8253          
##                  95% CI : (0.7588, 0.8798)
##     No Information Rate : 0.506           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6498          
##                                           
##  Mcnemar's Test P-Value : 0.00933         
##                                           
##             Sensitivity : 0.9167          
##             Specificity : 0.7317          
##          Pos Pred Value : 0.7778          
##          Neg Pred Value : 0.8955          
##              Prevalence : 0.5060          
##          Detection Rate : 0.4639          
##    Detection Prevalence : 0.5964          
##       Balanced Accuracy : 0.8242          
##                                           
##        'Positive' Class : malignant       
## 
fourfoldplot(confMatrix_ts$table, color = c("firebrick2", "seagreen3"),
             conf.level = 0, margin = 1, main = "Macierz pomylek na zbiorze testowym")